

**Set replication path to wherever the codes and datasets are
global replication "ADD path"

set seed 1234

use "${replication}\visa_level_data.dta", clear

/*
This dataset is constructed using proprietary data from Orbis and DnB Hoovers combined with I-129 form data from USCIS. Hence, it cannot be publicly posted. Please refer to the Readme file to see how the dataset was constructed and how to access the information.


This dataset has one observation per approved H-1B visa application between 2001 to 2014. Sample consists of firms that were matched to the corporate datasets as explained in Appendix A. Firms correspond to Global Ultimate Owners (GUOs) of the US firms that apply for the visas.

The following variables are included:

guo_name: anonymized identifier for GUO firm 
original_year: Fiscal year that the visa application was submitted
receiptfy: Grouped version of years -> 2003 (2001-2003), 2005 (2004-2006), 2008 (2007-2009), 2011 (2010-2012), 2014 (2013-2014)
basisforclassification: Whether visa application was for new employment, renewal, change of employers, other. 
jobtitle: occupation as reported in visa application 
isco3dig: occupation mapped to isco 3 digit codes 
iscodesc: Description of isco occupation 
countryofbirth: Country of birth of immigrant coming through the H-1B visa 
nationality: 2-digit code for country of birth 
wage_num: Wage as reported by H-1B visa 
h1b: Vector of ones, for aggregation 
new_employment: Indicator for new employment application 
renewal: Indicator for renewal application 
ind_dol: Industry group following NAICS and BEA classification.  14 industry groups.
source_country: Country where the MNE is headquartered
*/



*Generate Tables with counts per year

*[IN DRAFT] Appendix Table A2: Distribution of visa petitions by nationality and source country of MNEs
tab source_country_alt 
tab source_country_alt if receiptfy>=2001 &  receiptfy<=2011
tab source_country_alt if receiptfy>=2012 &  receiptfy<=2014

tab nationality
tab nationality if receiptfy>=2002 &  receiptfy<=2011
tab nationality if receiptfy>=2012 &  receiptfy<=2014

*[IN DRAFT] Appendix Table A3: Distribution of visa petitions by industry
tab ind_dol  
tab ind_dol if receiptfy>=2001 &  receiptfy<=2011
tab ind_dol if receiptfy>=2012 &  receiptfy<=2014





*Create aggregates at the firm level level
bysort guo_name receiptfy nationality: egen total_h1b=sum(h1b)
bysort guo_name receiptfy nationality: egen wbill_h1b=sum(wage_num)
bysort guo_name receiptfy nationality: egen total_h1b_new=sum(h1b*new_employment)
bysort guo_name receiptfy nationality: egen wbill_h1b_new=sum(wage_num*new_employment)


*Occupation data

gen managers=(isco3dig==12)
gen administrative=(isco3dig==343)
gen engineers=(isco3dig==214)

bysort receiptfy isco3dig: egen rank_occ_wage=xtile(wage_num), nq(4)
gen very_high_wage=(rank_occ_wage==4)

bysort guo_name receiptfy nationality: egen managers_avg=mean(managers)
bysort guo_name receiptfy nationality: egen admins_avg=mean(administrative)
bysort guo_name receiptfy nationality: egen engineers_avg=mean(engineers)
bysort guo_name receiptfy nationality: egen engineers_vhw_avg=mean(engineers*very_high_wage)

*Keep one observation per firm-year-nationality
duplicates drop guo_name receiptfy nationality, force

*Create variables in logs
gen ltotal_h1b=log(total_h1b)
gen lwbill_h1b=log(wbill_h1b)
gen lavg_wage_total=log(wbill_h1b/total_h1b)
gen avg_wage_total=wbill_h1b/total_h1b
gen ltotal_h1b_new=log(total_h1b_new)


*Variables for fixed effects and clustering
encode source_country, gen(source_country_numeric)
encode nationality, gen(nationality_numeric)
encode ind_dol, gen(industry)

egen nationality_industry=group(nationality_numeric industry)
egen source_industry=group(source_country_numeric industry)
egen nationality_time=group(nationality_numeric receiptfy)
egen nationality_industry_time=group(nationality_numeric industry receiptfy)
egen source_industry_time=group(source_country_numeric industry receiptfy)

gen firm_numeric=guo_name
egen firm_time=group(firm_numeric receiptfy)

egen clustervar2=group(nationality_numeric source_country_numeric)



*Sourcing indicators (MNE origin is the same as worker nationality)

gen sourcing=(source_country==nationality)

gen cpair_au=(source_country=="AU" & nationality=="AU")
gen cpair_ca=(source_country=="CA" & nationality=="CA")
gen cpair_ch=(source_country=="CH" & nationality=="CH")
gen cpair_cn=(source_country=="CN" & nationality=="CN")
gen cpair_de=(source_country=="DE" & nationality=="DE")
gen cpair_es=(source_country=="ES" & nationality=="ES")
gen cpair_fi=(source_country=="FI" & nationality=="FI")
gen cpair_fr=(source_country=="FR" & nationality=="FR")
gen cpair_gb=(source_country=="GB" & nationality=="GB")
gen cpair_ie=(source_country=="IE" & nationality=="IE")
gen cpair_il=(source_country=="IL" & nationality=="IL")
gen cpair_in=(source_country=="IN" & nationality=="IN")
gen cpair_jp=(source_country=="JP" & nationality=="JP")
gen cpair_kr=(source_country=="KR" & nationality=="KR")
gen cpair_nl=(source_country=="NL" & nationality=="NL")
gen cpair_se=(source_country=="SE" & nationality=="SE")
gen cpair_tw=(source_country=="TW" & nationality=="TW")




*Run home-bias regressions at the firm level, pooling all countries
quietly reghdfe ltotal_h1b sourcing if nationality!="other" & source_country!="other" & total_h1b>1, absorb(source_industry_time nationality_industry_time) vce(cluster clustervar2)
gen sample1=e(sample)
eststo m1


quietly reghdfe ltotal_h1b sourcing if nationality!="other" & source_country!="other" & total_h1b>1 & sample1==1, absorb(firm_time nationality_industry_time) vce(cluster clustervar2)
eststo m3

quietly reghdfe lavg_wage_total sourcing if nationality!="other" & source_country!="other" & total_h1b>1, absorb(source_industry_time nationality_industry_time) vce(cluster clustervar2)
eststo m4

quietly reghdfe lavg_wage_total sourcing if nationality!="other" & source_country!="other" & total_h1b>1, absorb(firm_time nationality_industry_time) vce(cluster clustervar2)
eststo m5



*[IN FINAL DRAFT] Columns 1, 2, 4, 5 of Appendix Table B6 "Home bias regressions for employment and wages"
esttab m1 m3 m4 m5, se nocons  star(* 0.10 ** 0.05  *** 0.01)




*[IN FINAL DRAFT]  Regression that creates Figure 1 "Estimated coefficient (gamma_s) by source country" - coefficient + 95% confidence interval
reghdfe ltotal_h1b cpair* if nationality!="other" & source_country!="other" & total_h1b>1, absorb(source_industry_time nationality_industry_time) vce(cluster clustervar2)



*Analysis by industry


gen ind_sourcing_finance=(source_country==nationality)*(industry==1)
gen ind_sourcing_information=(source_country==nationality)*(industry==2)
gen ind_sourcing_chemicals=(source_country==nationality)*(industry==3)
gen ind_sourcing_computers=(source_country==nationality)*(industry==4)
gen ind_sourcing_electrical=(source_country==nationality)*(industry==5)
gen ind_sourcing_food=(source_country==nationality)*(industry==6)
gen ind_sourcing_machinery=(source_country==nationality)*(industry==7)
gen ind_sourcing_metals=(source_country==nationality)*(industry==8)
gen ind_sourcing_transportation=(source_country==nationality)*(industry==9)
gen ind_sourcing_other=(source_country==nationality)*(industry==10)
gen ind_sourcing_professionalsvcs=(source_country==nationality)*(industry==11)
gen ind_sourcing_realestate=(source_country==nationality)*(industry==12)
gen ind_sourcing_retail=(source_country==nationality)*(industry==13)
gen ind_sourcing_wholesale=(source_country==nationality)*(industry==14)



*[IN FINAL DRAFT]  Regression that creates Figure B1 " Estimated coefficient (gamma_k) by industry" - coefficient + 95% confidence interval
reghdfe ltotal_h1b ind_sourcing_finance-ind_sourcing_wholesale if nationality!="other" & source_country!="other" & total_h1b>1  & sample1==1, absorb(source_industry_time nationality_industry_time) vce(cluster clustervar2)




****************Occupation Analysis**********************

*[IN FINAL DRAFT]  Reported means in Table B11 "Dependent variable: Share of home-country immigrants in specific occupations" 
sum managers_avg if nationality!="other" & source_country!="other" & total_h1b>1
sum admins_avg if nationality!="other" & source_country!="other" & total_h1b>1
sum engineers_avg if nationality!="other" & source_country!="other" & total_h1b>1
sum engineers_vhw_avg if nationality!="other" & source_country!="other" & total_h1b>1



quietly reghdfe managers_avg sourcing if nationality!="other" & source_country!="other" & total_h1b>1, absorb(source_industry_time nationality_industry_time) vce(cluster clustervar2)
eststo m1
quietly reghdfe admins_avg sourcing if nationality!="other" & source_country!="other" & total_h1b>1, absorb(source_industry_time nationality_industry_time) vce(cluster clustervar2)
eststo m2
quietly reghdfe engineers_avg sourcing if nationality!="other" & source_country!="other" & total_h1b>1, absorb(source_industry_time nationality_industry_time) vce(cluster clustervar2)
eststo m3
quietly reghdfe engineers_vhw_avg sourcing if nationality!="other" & source_country!="other" & total_h1b>1, absorb(source_industry_time nationality_industry_time) vce(cluster clustervar2)
eststo m4


*[IN FINAL DRAFT]  Table B11 Regression coefficients in "Dependent variable: Share of home-country immigrants in specific occupations" 
esttab m2 m1 m3 m4, se nocons  star(* 0.10 ** 0.05  *** 0.01)




*[IN FINAL DRAFT]  Figure B4 "Home bias robustness: occupations" - coefficient + 95% confidence interval
  
*Figure B4a
reghdfe admins_avg cpair* if nationality!="other" & source_country!="other" & total_h1b>1, absorb(source_industry_time nationality_industry_time) vce(cluster clustervar2)
*Figure B4b
reghdfe managers_avg cpair* if nationality!="other" & source_country!="other" & total_h1b>1, absorb(source_industry_time nationality_industry_time) vce(cluster clustervar2)
*Figure B4c
reghdfe engineers_avg cpair* if nationality!="other" & source_country!="other" & total_h1b>1, absorb(source_industry_time nationality_industry_time) vce(cluster clustervar2)




***********Analysis by application size and time since first application************



*First assign to each firm an application size bin and year since application*

preserve

use "${replication}\visa_level_data.dta", clear

gen sourcing_temp=(source_country==nationality)
bysort guo_name: egen avg_visas=sum(h1b)
bysort guo_name: egen avg_visas_oth=sum(h1b*(1-sourcing_temp))


*Keep data at the firm-year level
duplicates drop guo_name receiptfy, force

*Count periods since first application
bysort guo_name: egen year_rank=rank(receiptfy), track


*Calculate average visas applied for 
bysort guo_name: egen tot_years=max(year_rank)
gen average_visas=avg_visas/tot_years
gen average_visas_oth=avg_visas_oth/tot_years

*Calculate visa application quartiles
xtile nvisas_rank=average_visas, nq(4)
xtile nvisas_oth_rank=average_visas_oth, nq(4)

keep guo_name receiptfy year_rank nvisas_*

*Save dataset that will be re-merged to master data
tempfile year_rank
save `year_rank', replace

restore

*Merge dataset that contains application quartile and periods since first application
merge m:1 guo_name receiptfy using `year_rank'
drop _m



**Analysis of home bias by H-1B application size


*Create sourcing indicators interacted with size bin


forval x=1/4 {

gen sourcing_nvisas_oth_`x'=(sourcing==1 & nvisas_oth_rank==`x')
gen nvisas_oth_rank`x'=(nvisas_oth_rank==`x')
gen nvisas_rank`x'=(nvisas_rank==`x')

}


quietly reghdfe ltotal_h1b nvisas_oth_rank2 nvisas_oth_rank3 nvisas_oth_rank4 sourcing sourcing_nvisas_oth_2 sourcing_nvisas_oth_3 sourcing_nvisas_oth_4 if nationality!="other" & source_country!="other" & total_h1b>1, absorb(source_industry_time nationality_industry_time) vce(cluster clustervar2)
eststo m1
quietly reghdfe ltotal_h1b nvisas_oth_rank2 nvisas_oth_rank3 nvisas_oth_rank4 sourcing  sourcing_nvisas_oth_2 sourcing_nvisas_oth_3 sourcing_nvisas_oth_4 if nationality!="other" & source_country!="other" & source_country!="IN" & total_h1b>1, absorb(source_industry_time nationality_industry_time) vce(cluster clustervar2)
eststo m2
quietly reghdfe total_h1b nvisas_oth_rank2 nvisas_oth_rank3 nvisas_oth_rank4  sourcing sourcing_nvisas_oth_2 sourcing_nvisas_oth_3 sourcing_nvisas_oth_4 if nationality!="other" & source_country!="other" & source_country!="IN" & total_h1b>1, absorb(source_industry_time nationality_industry_time) vce(cluster clustervar2)
eststo m3

*[IN FINAL DRAFT]  Table B8 "Home bias by application size quartile"
esttab m1 m2 m3, se nocons star(* 0.10 ** 0.05  *** 0.01) keep(sourcing*)



*Analysis by source country
gen spo50_oth_sourcing=(sourcing==1 & (nvisas_oth_rank==3 | nvisas_oth_rank==4))
gen spo50_oth_cpair_au=(source_country=="AU" & nationality=="AU" & (nvisas_oth_rank==3 | nvisas_oth_rank==4))
gen spo50_oth_cpair_ca=(source_country=="CA" & nationality=="CA" & (nvisas_oth_rank==3 | nvisas_oth_rank==4))
gen spo50_oth_cpair_ch=(source_country=="CH" & nationality=="CH" & (nvisas_oth_rank==3 | nvisas_oth_rank==4))
gen spo50_oth_cpair_cn=(source_country=="CN" & nationality=="CN" & (nvisas_oth_rank==3 | nvisas_oth_rank==4))
gen spo50_oth_cpair_de=(source_country=="DE" & nationality=="DE" & (nvisas_oth_rank==3 | nvisas_oth_rank==4))
gen spo50_oth_cpair_es=(source_country=="ES" & nationality=="ES" & (nvisas_oth_rank==3 | nvisas_oth_rank==4))
gen spo50_oth_cpair_fi=(source_country=="FI" & nationality=="FI" & (nvisas_oth_rank==3 | nvisas_oth_rank==4))
gen spo50_oth_cpair_fr=(source_country=="FR" & nationality=="FR" & (nvisas_oth_rank==3 | nvisas_oth_rank==4))
gen spo50_oth_cpair_gb=(source_country=="GB" & nationality=="GB" & (nvisas_oth_rank==3 | nvisas_oth_rank==4))
gen spo50_oth_cpair_ie=(source_country=="IE" & nationality=="IE" & (nvisas_oth_rank==3 | nvisas_oth_rank==4))
gen spo50_oth_cpair_il=(source_country=="IL" & nationality=="IL" & (nvisas_oth_rank==3 | nvisas_oth_rank==4))
gen spo50_oth_cpair_in=(source_country=="IN" & nationality=="IN" & (nvisas_oth_rank==3 | nvisas_oth_rank==4))
gen spo50_oth_cpair_jp=(source_country=="JP" & nationality=="JP" & (nvisas_oth_rank==3 | nvisas_oth_rank==4))
gen spo50_oth_cpair_kr=(source_country=="KR" & nationality=="KR" & (nvisas_oth_rank==3 | nvisas_oth_rank==4))
gen spo50_oth_cpair_nl=(source_country=="NL" & nationality=="NL" & (nvisas_oth_rank==3 | nvisas_oth_rank==4))
gen spo50_oth_cpair_se=(source_country=="SE" & nationality=="SE" & (nvisas_oth_rank==3 | nvisas_oth_rank==4))
gen spo50_oth_cpair_tw=(source_country=="TW" & nationality=="TW" & (nvisas_oth_rank==3 | nvisas_oth_rank==4))

gen size_control=(nvisas_oth_rank==3 | nvisas_oth_rank==4)


*[IN FINAL DRAFT]  Figure B3 "Home bias difference for large applicants" - coefficients for spo50_oth_cpair and their confidence intervals reported in Figure B3.
reghdfe ltotal_h1b size_control cpair* spo50_oth_cpair* if nationality!="other" & source_country!="other" & total_h1b>1, absorb(source_industry_time nationality_industry_time) vce(cluster clustervar2)





***********Analysis using hiring over time**************

gen years_since=year_rank

*Some countries have many firms that just show up hiring for H-1Bs one year and never again
gen high_attrition=(source_country_alt=="CN" | source_country_alt=="ES" | source_country_alt=="AU" | source_country_alt=="IL" | source_country_alt=="KR" | source_country_alt=="other")


*Create sourcing interactions with years since first hiring
forval x=1/5 {
gen yr_sourcingfor`x'=(sourcing==1 & years_since==`x')
}




*Determine which firms are observed hiring for the first time after 2003
gen start_after_03=(years_since==1 & receiptfy>2003)
bysort guo_name: egen start_after_2003=max(start_after_03)


quietly reghdfe ltotal_h1b sourcing yr_sourcingfor2 yr_sourcingfor3 yr_sourcingfor4 if nationality!="other" & source_country!="other" & total_h1b>1 & start_after_2003==1, absorb(years_since source_industry_time nationality_industry_time) vce(cluster clustervar2)
eststo m3
quietly reghdfe ltotal_h1b sourcing yr_sourcingfor2 yr_sourcingfor3 yr_sourcingfor4 if nationality!="other" & source_country!="other" & total_h1b>1 & source_country!="IN" & start_after_2003==1, absorb(years_since source_industry_time nationality_industry_time) vce(cluster clustervar2)
eststo m4
quietly reghdfe ltotal_h1b sourcing yr_sourcingfor2 yr_sourcingfor3 yr_sourcingfor4 if nationality!="other" & source_country!="other" & total_h1b>1 & source_country!="IN" & high_attrition!=1 & start_after_2003==1, absorb(years_since source_industry_time nationality_industry_time) vce(cluster clustervar2)
eststo m5

*[IN FINAL DRAFT]  Table B10 "Home bias by firm-level characteristics"
esttab m3 m4 m5, se nocons  star(* 0.10 ** 0.05  *** 0.01)




***********Table with count of firms*************

preserve

gen large_applicant=(nvisas_rank==3 | nvisas_rank==4)
bysort guo_name: egen n_period=nvals(receiptfy)
gen hires_every_period=(n_period==5)

bysort guo_name: egen avg_sourcing=mean(sourcing)

gen hire_only_source=(avg_sourcing==1)
gen hire_only_nonsource=(avg_sourcing==0)
gen hire_all=(avg_sourcing>0 & avg_sourcing<1)



duplicates drop guo_name, force

table source_country_alt, stat(count receiptfy) stat(sum large_applicant) stat(sum hires_every_period) stat(sum hire_all) stat(sum hire_only_nonsource) stat(sum hire_only_source)

restore







**********Correlates between home-bias and observables********************


encode ind_dol, generate(industry_numeric)

*Run regression from equation 2
reghdfe ltotal_h1b i.industry_numeric##cpair* if nationality!="other" & source_country!="other" & total_h1b>1, absorb(source_industry_time nationality_industry_time) vce(cluster clustervar2)

*I copy to excel this regression and use this to calculate industry-country home bias to be imported in next step
*Add each cpair fixed effect to the corresponding industry#country fixed effect to obtain the country-industry home bias parameter. Then save the home-bias parameters.

*Excel file with calculation of home bias coefficients can be found in file "ADD Excel name.xlsx"



*Next I open a file with the estimated home-bias coefficient for employment and wages and source country, industry, and socure-industry level characteristics. The regression for wages is run below under the wage analysis. The full file with the coefficients and the characteristics is "${replication}\source_industry_characteristics.dta"

preserve

use "${replication}\source_industry_characteristics.dta", clear


/*
Definition of variables:


home_bias - coefficients of home bias estimated in equation 2
home_bias_wage - coefficients of home bias estimated in equation 21
gdpperworker - GDP per worker of source country (source:World Bank)
non_english - Indicator that English is not the country's primary language (source:CEPII)
distw  - Population weighted distance from the US to country s (source:CEPII)
total_output - Industry GDP at s (source: WIOT)
export_share_toUS - Share of US imports from s in k=Total US imports from country s in industry k/Total US imports in industry k (source: WIOT)
comparative_advantage2 - Share of exports by country s in industry k/Exports of country s as a share of total World exports (source: WIOT)
avg_college -  College grads as a share of employment in k in the US  (source: ACS)
avg_college_wage - Average wage for college grads in industry k, in the US (source: ACS)
industry_share  -  Employment share in US  (source: ACS)
employment_available - total employment from MNEs from s , industry k, in the US (source:BEA)
employment_growth - Change in employment of MNEs from s, industry k in the US from 2004 to 2014 (source:BEA)

*/


*[IN FINAL DRAFT]  Table 1 " Pairwise correlation between home bias and observables" - first column of the correlation matrix below are used to fill the contents of Table 1
pwcorr home_bias gdpperworker non_english distw total_output export_share_toUS comparative_advantage2 avg_college avg_college_wage industry_share employment_available employment_growth, sig


*[IN FINAL DRAFT]  Table B12 "Pairwise correlation between source-nonsource wage difference and observables" - first column of the correlation matrix below are used to fill the contents of Table B12
pwcorr home_bias_wage gdpperworker non_english distw total_output export_share_toUS comparative_advantage2 avg_college avg_college_wage industry_share employment_available employment_growth, sig

restore




****PPML regression*****


use "${replication}\visa_level_data.dta", clear


*Create dataset for PPML

preserve
duplicates drop receiptfy, force
keep receiptfy
tempfile year_ppml
save `year_ppml', replace
restore


preserve
duplicates drop guo_name, force
keep guo_name source_country ind_dol
tempfile guo_ppml
save `guo_ppml', replace
restore




preserve
bysort guo_name receiptfy nationality: egen total_h1b=sum(h1b)
keep guo_name receiptfy nationality total_h1b
duplicates drop guo_name receiptfy nationality, force
tempfile h1b_ppml
save `h1b_ppml', replace
restore



duplicates drop nationality, force
keep nationality

*Create the full interactions between nationality, guo and year
cross using `year_ppml'
cross using `guo_ppml'


merge 1:1 guo_name nationality receiptfy using `h1b_ppml'
drop _m

*Make sure all observations have something for the total h1b hiring
replace total_h1b=0 if total_h1b==.

*Define fixed effects
gen sourcing=(source_country==nationality)
encode source_country, gen(source_country_numeric)
encode nationality, gen(nationality_numeric)
encode ind_dol, gen(industry)
egen clustervar2=group(nationality_numeric source_country_numeric)
egen nationality_industry_time=group(nationality_numeric industry receiptfy)
egen source_industry_time=group(source_country_numeric industry receiptfy)


*[IN FINAL DRAFT] Column 3 of Table B6: "Home bias regressions for employment and wages"
ppmlhdfe total_h1b sourcing if nationality!="other" & source_country!="other", absorb(source_industry_time  nationality_industry_time) noabsorb vce(cluster clustervar2)


***Aggregate Analysis - Appendix B3****


***Open new dataset that is at the source country - nationality - industry level. The years included are only 2012 -2014 and are aggregated into a single time period. This dataset includes the aggregates for h1b visas and l1 visas whenever available. Data is processed in the same way as the visa level data used above, and then aggregated.

use "${replication}\aggregate_level_data.dta", clear




*Define fixed effects
gen sourcing=(source_country==nationality)
encode source_country, gen(source_country_numeric)
encode nationality, gen(nationality_numeric)
egen nationality_industry=group(nationality_numeric ind_dol)
egen source_industry=group(source_country_numeric ind_dol)
egen clustervar2=group(nationality_numeric source_country_numeric)




gen logtotal_l1=log(total_l1)
gen logtotal_h1b=log(total_h1b)
gen logtotal=log(total_l1+total_h1b)



gen cpair_au=(source_country=="AU" & nationality=="AU")
gen cpair_ca=(source_country=="CA" & nationality=="CA")
gen cpair_ch=(source_country=="CH" & nationality=="CH")
gen cpair_cn=(source_country=="CN" & nationality=="CN")
gen cpair_de=(source_country=="DE" & nationality=="DE")
gen cpair_es=(source_country=="ES" & nationality=="ES")
gen cpair_fi=(source_country=="FI" & nationality=="FI")
gen cpair_fr=(source_country=="FR" & nationality=="FR")
gen cpair_gb=(source_country=="GB" & nationality=="GB")
gen cpair_ie=(source_country=="IE" & nationality=="IE")
gen cpair_il=(source_country=="IL" & nationality=="IL")
gen cpair_in=(source_country=="IN" & nationality=="IN")
gen cpair_jp=(source_country=="JP" & nationality=="JP")
gen cpair_kr=(source_country=="KR" & nationality=="KR")
gen cpair_nl=(source_country=="NL" & nationality=="NL")
gen cpair_se=(source_country=="SE" & nationality=="SE")
gen cpair_tw=(source_country=="TW" & nationality=="TW")


capture drop sample*


*Define common sample (o,k,s triplets that have both l1s and h1bs)
quietly reghdfe logtotal_l1 sourcing if nationality!="other" & source_country!="other", absorb(nationality_industry source_industry) vce(cluster clustervar2)
gen sample1=e(sample)
quietly reghdfe logtotal_h1b sourcing if nationality!="other" & source_country!="other", absorb(nationality_industry source_industry) vce(cluster clustervar2)
gen sample3=e(sample)
quietly reghdfe logtotal sourcing if nationality!="other" & source_country!="other", absorb(nationality_industry source_industry) vce(cluster clustervar2)
gen sample6=e(sample)


egen sample100=rowmin(sample*)


quietly reghdfe logtotal_l1 sourcing if nationality!="other" & source_country!="other" & sample100==1, absorb(nationality_industry source_industry) vce(cluster clustervar2)
eststo m1
quietly reghdfe logtotal_h1b sourcing if nationality!="other" & source_country!="other" & sample100==1, absorb(nationality_industry source_industry) vce(cluster clustervar2)
eststo m2
quietly reghdfe logtotal sourcing if nationality!="other" & source_country!="other" & sample100==1, absorb(nationality_industry source_industry) vce(cluster clustervar2)
eststo m3


*[IN FINAL DRAFT]  OLS columns in Table B7: "Home bias: Aggregate regressions for H-1B and L-1"
esttab m1 m2 m3, se nocons  star(* 0.10 ** 0.05  *** 0.01)


*[IN FINAL DRAFT]  Figure B2: "Estimated coefficient (gamma_s) on sourcing regression by country (H-1B vs. L-1)" - coefficients + 95% confidence intervals
reghdfe logtotal_l1 cpair* if nationality!="other" & source_country!="other" & sample100==1, absorb(nationality_industry source_industry) vce(cluster clustervar2)
reghdfe logtotal_h1b cpair* if nationality!="other" & source_country!="other" & sample100==1, absorb(nationality_industry source_industry) vce(cluster clustervar2)



***PPML regressions of aggregate analysis

*Create PPML dataset
preserve
duplicates drop source_country, force
keep source_country
tempfile source_ppml
save `source_ppml', replace
restore

preserve
duplicates drop ind_dol, force
keep ind_dol
tempfile industry_ppml
save `industry_ppml', replace
restore

duplicates drop nationality, force
keep nationality

cross using `source_ppml'
cross using `industry_ppml'


merge 1:1 source_country ind_dol nationality using "${replication}\aggregate_level_data.dta", keepusing(total_h1b total_l1)
drop if _m==2
drop _m


replace total_h1b=0 if total_h1b==.
replace total_l1=0 if total_l1==.
gen total_visas=total_h1b+total_l1

drop if ind_dol==""

gen sourcing=(source_country==nationality)
encode source_country, gen(source_country_numeric)
encode nationality, gen(nationality_numeric)
encode ind_dol, gen(industry)
egen nationality_industry=group(nationality_numeric industry)
egen source_industry=group(source_country_numeric industry)
egen clustervar2=group(nationality_numeric source_country_numeric)


capture drop sample*
quietly ppmlhdfe total_l1 sourcing if source_country!="other", absorb(nationality_industry source_industry) vce(cluster clustervar2)
gen sample=e(sample)
eststo m1
quietly ppmlhdfe total_h1b sourcing if source_country!="other" & sample==1, absorb(nationality_industry source_industry) vce(cluster clustervar2)
eststo m2
quietly ppmlhdfe total_visas sourcing if source_country!="other" & sample==1, absorb(nationality_industry source_industry) vce(cluster clustervar2)
eststo m3

*[IN FINAL DRAFT]  PPML columns in Table B7: "Home bias: Aggregate regressions for H-1B and L-1"
esttab m3 m2 m1, se nocons  star(* 0.10 ** 0.05  *** 0.01)



***Correlation between size and number of applications - Appendix B.4.1 ******


*This section uses two separate datasets to calculate the correlation between measures of firm size and application size_control


*Measure 1 - comparing size from BEA with H-1B applications.


use "${replication}\bea_size.dta", clear

/*This file is at the industry-year-source country level. It includes the following variables:

source_country - source country of the MNE
ind_dol - Industry
receiptfy - year (grouped as in the rest of the analysis)
rev_touse - Revenues from BEA of US affiliates of foreign MNEs
emp_touse - Employment from BEA of US affiliates of foreign MNEs
visas_all - Number of approved H-1B visas
visas_new - Number of approved H-1B visas for new employment
visas_all_oth - Number of approved H-1B visas for non-source country immigrants
visas_new_oth - Number of approved H-1B visas for new employment for non-source country immigrants


*/


*Calculate denominator at the industry-year level
bysort receiptfy ind_dol: egen total_emp_ind=sum(emp_touse)
bysort receiptfy ind_dol: egen total_rev_ind=sum(rev_touse)
bysort receiptfy ind_dol: egen total_visas_oth_ind=sum(visas_all)
bysort receiptfy ind_dol: egen total_visas_oth_new_ind=sum(visas_all)

*Calculate shares by source country for revenues, employment, and visas
gen share_rev=rev_touse/total_rev_ind
gen share_emp=emp_touse/total_emp_ind

gen share_visas_all_oth=visas_all_oth/total_visas_oth_ind
gen share_visas_new_oth=visas_new_oth/total_visas_oth_new_ind





quietly reghdfe share_emp share_visas_all_oth, noabsorb
eststo m1
quietly reghdfe share_rev share_visas_all_oth, noabsorb
eststo m3

*[IN FINAL DRAFT]  Table B9: Correlation between number of applications and size (left panel)
esttab m1 m3 , se r2 nocons
correl share_emp share_visas_all_oth
correl share_rev share_visas_all_oth


*Measure 2 - comparing size from Orbis and H-1B applications


/*

use "${replication}\orbis_size.dta", clear

/*

Not included in replication package because it includes propietary data from Orbis

This file has one observation per GUO firm that had available total number of employees in Orbis.
The variables are:

guo_name: Name of GUO firm
ind_dol_3: 3 digit Industry from Orbis 
avg_guo_employees: Average number of employees across years reported 
average_visas: Average number of H-1B visas approved among all years firm had any H-1Bs approved 
average_visas_new: Average number of H-1B visas for new employment approved among all years firm had any H-1Bs approved 
average_visas_oth : Average number of H-1B visas for non-source immigrants approved among all years firm had any H-1Bs approved 
average_visas_othnew: Average number of H-1B visas for new employment for non-source immigrants approved among all years firm had any H-1Bs approved  
source_country_alt: Source country including US MNE/non-MNE breakdown 



*/

*First residualize number of employees and number of visas from industry components
quietly reghdfe avg_guo_employees, absorb(ind_dol_3) resid
predict guo_emp_noind, resid
quietly reghdfe average_visas_oth, absorb(ind_dol_3) resid
predict visa_oth_noind, resid
quietly reghdfe average_visas, absorb(ind_dol_3) resid
predict visa_noind, resid


quietly reghdfe guo_emp_noind visa_oth_noind, noabsorb
eststo m1
quietly reghdfe guo_emp_noind visa_oth_noind if source_country_alt=="US MNE" | source_country_alt=="US nonMNE", noabsorb
eststo m2
quietly reghdfe guo_emp_noind visa_oth_noind if source_country_alt!="US MNE" & source_country_alt!="US nonMNE", noabsorb
eststo m3


*[IN FINAL DRAFT]  Table B9: Correlation between number of applications and size (right panel)
esttab m1 m2 m3, se r2 nocons
correl guo_emp_noind visa_oth_noind
correl guo_emp_noind visa_oth_noind if source_country_alt=="US MNE" | source_country_alt=="US nonMNE"
correl guo_emp_noind visa_oth_noind if source_country_alt!="US MNE" & source_country_alt!="US nonMNE"

*/

*****Analysis on wages - Section 3.2*********

use "${replication}\visa_level_data.dta", clear

gen lwage=log(wage_num)

*Correct entries of countries that have multiple names
replace countryofbirth="KOREA SOUTH" if countryofbirth=="KOREA"
replace countryofbirth="RUSSIA" if countryofbirth=="USSR" 


*Create fixed effects
gen sourcing=(source_country==nationality)
replace sourcing=0 if source_country=="other" & nationality=="other"
encode source_country, gen(source_country_numeric)
encode nationality, gen(nationality_numeric)
encode ind_dol, gen(industry)
encode countryofbirth, gen(nationality_det_numeric)
egen nationality_industry=group(nationality_numeric industry)
egen source_industry=group(source_country_numeric industry)
egen clustervar2=group(nationality_numeric source_country_numeric)
gen firm_numeric=guo_name
egen firm_time=group(firm_numeric receiptfy)
egen nationality_time=group(nationality_det_numeric receiptfy)
egen nationality_industry_time=group(nationality_det_numeric industry receiptfy)
egen source_industry_time=group(source_country_numeric industry receiptfy)
egen occ_time=group(iscodesc receiptfy)


gen cpair_au=(source_country=="AU" & nationality=="AU")
gen cpair_ca=(source_country=="CA" & nationality=="CA")
gen cpair_ch=(source_country=="CH" & nationality=="CH")
gen cpair_cn=(source_country=="CN" & nationality=="CN")
gen cpair_de=(source_country=="DE" & nationality=="DE")
gen cpair_es=(source_country=="ES" & nationality=="ES")
gen cpair_fi=(source_country=="FI" & nationality=="FI")
gen cpair_fr=(source_country=="FR" & nationality=="FR")
gen cpair_gb=(source_country=="GB" & nationality=="GB")
gen cpair_ie=(source_country=="IE" & nationality=="IE")
gen cpair_il=(source_country=="IL" & nationality=="IL")
gen cpair_in=(source_country=="IN" & nationality=="IN")
gen cpair_jp=(source_country=="JP" & nationality=="JP")
gen cpair_kr=(source_country=="KR" & nationality=="KR")
gen cpair_nl=(source_country=="NL" & nationality=="NL")
gen cpair_se=(source_country=="SE" & nationality=="SE")
gen cpair_tw=(source_country=="TW" & nationality=="TW")






quietly reghdfe lwage sourcing, absorb(nationality_industry_time source_industry_time occ_time)
eststo m1
quietly reghdfe lwage sourcing, absorb(nationality_industry_time firm_time occ_time)
eststo m2

*[IN FINAL DRAFT]  Columns 6 and 7 of Table B6: "Home bias regressions for employment and wages"
esttab m1 m2, se nocons star(* 0.10 ** 0.05  *** 0.01)


*Run main wage regression and store fixed effects to be used later.

*[IN FINAL DRAFT]  Figure 2: "Estimated coefficient (beta_s) on wage regression by country (H-1B)"
reghdfe lwage cpair*, absorb(nationality_industry_time source_industry_time occ_time, savefe)
gen sample1=e(sample)


**********Get origin FEs*****************

preserve
*I compute the average fixed effect by origin country by weighting the industry-year pairs by number of observations
bysort nationality_det_numeric industry receiptfy: egen num_okt=sum(sample1)
bysort nationality_det_numeric: egen num_o=sum(sample1)
*Compute weights
gen share_okt=num_okt/num_o


duplicates drop nationality_det_numeric industry receiptfy, force

*Compute weighted average of origin fixed effects
bysort nationality_det_numeric: egen origin_fe=sum(share_okt*__hdfe1__)

duplicates drop nationality_det_numeric, force

*Keep origin fixed effects to be used later
keep countryofbirth num_o origin_fe


*Save FEs in a separate file to be used later
*save "${replication}\origin_fe.dta", replace

restore



*Analysis by industry


gen ind_sourcing_finance=(source_country==nationality)*(industry==1)
gen ind_sourcing_information=(source_country==nationality)*(industry==2)
gen ind_sourcing_chemicals=(source_country==nationality)*(industry==3)
gen ind_sourcing_computers=(source_country==nationality)*(industry==4)
gen ind_sourcing_electrical=(source_country==nationality)*(industry==5)
gen ind_sourcing_food=(source_country==nationality)*(industry==6)
gen ind_sourcing_machinery=(source_country==nationality)*(industry==7)
gen ind_sourcing_metals=(source_country==nationality)*(industry==8)
gen ind_sourcing_transportation=(source_country==nationality)*(industry==9)
gen ind_sourcing_other=(source_country==nationality)*(industry==10)
gen ind_sourcing_professionalsvcs=(source_country==nationality)*(industry==11)
gen ind_sourcing_realestate=(source_country==nationality)*(industry==12)
gen ind_sourcing_retail=(source_country==nationality)*(industry==13)
gen ind_sourcing_wholesale=(source_country==nationality)*(industry==14)

*[IN FINAL DRAFT]  Figure B5: "Estimated coefficient (beta_k) by industry"
reghdfe lwage ind_sourcing_*, absorb(nationality_industry_time source_industry_time occ_time)


*Correlation of home bias for wages and observables following the same process as employment:

reghdfe lwage i.industry##cpair*, absorb(nationality_industry_time source_industry_time occ_time) vce(cluster clustervar2)


*I copy to excel this regression and use this to calculate industry-country home bias to be imported in next step
*Add each cpair fixed effect to the corresponding industry#country fixed effect to obtain the country-industry home bias parameter. Then save the home-bias parameters. Calculations with wages done above together with employment results


**************Results for Figure 3**********

use "${replication}\origin_stats.dta", clear

/*
I construct this datset using the origin fixed effects estimated above which are in the variable "cb_fe". I normalize the fixed effects to set India's FE to 0. The normalized ones are called "cb_fe_normalized". The number of observations used to compute the fixed effects are in the variable "cb_count". I keep only fixed effects that were estimated with more than 250 observations. The remaining variables in the dataset are publicly available information at the country of origin level as follows:


CountryName - origin country o
coll_share - Share of workers with a tertiary education in o (source: World Bank)
gdp_pc - GDP per capita in o (source: World Bank)
gdp_pw - GDP per worker in o (source: World Bank)
tot_pop - Total Population in o (source: World Bank)
tot_immig - Total number of college educated immigrants from country o in the US (source: IAB Brain Drain database)
coll_pop - number of college graduates in o: coll_share*tot_pop/100
share_us - Immigrants from o in US relative to total college population in o: tot_immig/coll_pop


*/


gen gdp_pw_th=gdp_pw/1000
gen fe_pct=cb_fe_normalized*100

gen label_to_use=""
replace label_to_use="MEX" if CountryName=="Mexico"
replace label_to_use="CAN" if CountryName=="Canada"
replace label_to_use="ITA" if CountryName=="Italy"
replace label_to_use="FRA" if CountryName=="France"
replace label_to_use="GBR" if CountryName=="United Kingdom"
replace label_to_use="SGP" if CountryName=="Singapore"
replace label_to_use="IND" if CountryName=="India"
replace label_to_use="DNK" if CountryName=="Denmark"
replace label_to_use="ARG" if CountryName=="Argentina"
replace label_to_use="CHN" if CountryName=="China"
replace label_to_use="BRA" if CountryName=="Brazil"
replace label_to_use="GER" if CountryName=="Germany"
replace label_to_use="ESP" if CountryName=="Spain"
replace label_to_use="CHL" if CountryName=="Chile"
replace label_to_use="NOR" if CountryName=="Norway"
replace label_to_use="JPN" if CountryName=="Japan"
replace label_to_use="KOR" if CountryName=="Korea"
replace label_to_use="COL" if CountryName=="Colombia"
replace label_to_use="CAM" if CountryName=="Cameroon"
replace label_to_use="IRE" if CountryName=="Ireland"
replace label_to_use="PHL" if CountryName=="Philippines"
replace label_to_use="JAM" if CountryName=="Jamaica"
replace label_to_use="NPL" if CountryName=="Nepal"
replace label_to_use="UAE" if CountryName=="United Arab Emirates"
replace label_to_use="SAU" if CountryName=="Saudi Arabia"
replace label_to_use="NZL" if CountryName=="New Zealand"
replace label_to_use="CHN" if CountryName=="China"
replace label_to_use="KWT" if CountryName=="Kuwait"
replace label_to_use="HKG" if CountryName=="Hong Kong"
replace label_to_use="CHE" if CountryName=="Switzerland"
replace label_to_use="HND" if CountryName=="Honduras"
replace label_to_use="CZE" if CountryName=="Czech Republic"



gen label_position=3
replace label_position=6 if label_to_use=="CAM" |  label_to_use=="IND"  
replace label_position=2 if label_to_use=="GHA"
replace label_position=12 if label_to_use=="COL" | label_to_use=="FRA"
replace label_position=4 if label_to_use=="GBR" |  label_to_use=="ITA"
replace label_position=7 if label_to_use=="MEX"

corr gdp_pw_th fe_pct  if cb_count>400
local corr : di %4.2f r(rho)

*[IN FINAL DRAFT]  Figure 3: "Wage differences in US across origin countries" - GDP per worker
twoway (scatter gdp_pw_th fe_pct, mlabel(label_to_use) ytitle("GDP per worker (000s)") xtitle("Observed wage differential wrt India (%)") graphregion(fcolor(white) lcolor(white)) legend(off) mco(black) mfc(none) mlabc(black) mlabvpos(label_position) mlabg(0.2pt) mlabs(small)) (lfit gdp_pw_th fe_pct, subtitle(correlation=`corr') lcolor(black) lpattern(dash)) if cb_count>400


replace label_to_use="" if CountryName=="Italy"
replace label_to_use="" if CountryName=="Argentina"
replace label_to_use="" if CountryName=="Norway"
replace label_to_use="" if CountryName=="Chile"
replace label_to_use="" if CountryName=="Singapore"
replace label_to_use="AUS" if CountryName=="Australia"
replace label_to_use="GHA" if CountryName=="Ghana"
replace label_to_use="PRT" if CountryName=="Portugal"


gen label_position2=3
replace label_position2=4 if label_to_use=="KOR" | label_to_use=="BRA" | label_to_use=="CHE" | label_to_use=="CZE"
replace label_position2=6 if label_to_use=="CHN" | label_to_use=="FRA"


corr share_us fe_pct if cb_count>400
local corr : di %4.2f r(rho)

*[IN FINAL DRAFT]  Figure 3: "Wage differences in US across origin countries" -  Share of college graduates from o in US
twoway (scatter share_us fe_pct, mlabel(label_to_use) ytitle("Share of college grads from origin in US") xtitle("Observed wage differential wrt India (%)") graphregion(fcolor(white) lcolor(white)) legend(off) graphregion(fcolor(white) lcolor(white)) legend(off) mco(black) mfc(none) mlabc(black) mlabvpos(label_position2) mlabg(0.2pt) mlabs(small)) (lfit share_us fe_pct, subtitle(correlation=`corr')  lcolor(black) lpattern(dash)) if cb_count>400 




***********ESTIMATION SECTION***********************



use "${replication}\visa_level_data.dta", clear


*Estimation of labor supply elasticity Kappa


*Pool years in groups of 2
gen year=.
replace year=2002 if original_year==2002 | original_year==2001
replace year=2004 if original_year==2003 | original_year==2004
replace year=2006 if original_year==2005 | original_year==2006
replace year=2008 if original_year==2007 | original_year==2008
replace year=2010 if original_year==2009 | original_year==2010
replace year=2012 if original_year==2011 | original_year==2012
replace year=2014 if original_year==2013 | original_year==2014



*Use detailed versions of source-country and nationality 
gen industry=industry_orig
drop source_country nationality
rename source_country_det source_country
rename nationality_det nationality


*calculate aggregates
gen sourcing=(nationality==source_country)

bysort year source_country industry nationality: egen average_wage=mean(wage_num)
bysort year source_country industry nationality: egen employment=sum(1)
bysort year source_country industry nationality: egen dev_wage_population=sd(wage_num)


bysort source_country year industry: egen total_wage_bill=sum(wage_num)
bysort source_country year industry: egen total_employment=sum(1)

bysort source_country year industry: egen own_wage_bill=sum(wage_num*sourcing)
bysort source_country year industry: egen own_employment=sum(sourcing)

bysort source_country year industry nationality: egen o_wage_bill=sum(wage_num)
bysort source_country year industry nationality: egen o_employment=sum(1)

gen laverage_wage=log(average_wage)
gen lemployment=log(employment)



duplicates drop year industry nationality source_country, force




egen fixedeffects_tok=group(year industry nationality)
egen fixedeffects_tsk=group(year industry source_country)
egen fixedeffects_tskx=group(year industry source_country sourcing)
egen group_cluster2=group(year industry source_country)



*merge m:1 year source_country industry using "${path}Estimating supply elasticity\WIOT files\wiot_to_use.dta", keepusing(exports_non_US exp_byind_non_US)
merge m:1 year source_country industry using "${replication}\aux_vars_estimation_wiot.dta", keepusing(exports_non_US exp_byind_non_US)


drop if _m==2
drop if _m==1 & (source_country=="SG" | source_country=="HK" | source_country=="IL")
drop _m


gen exports_non_US1=exports_non_US/exp_byind_non_US 


bysort year nationality: egen tot_nationality=sum(employment)
gen share01=employment/tot_nationality if year==2002
bysort nationality industry source_country: egen share_01=mean(share01)


gen instrument1=share_01*exports_non_US1


*******Estimating kappa***********


quietly ivreghdfe laverage_wage (lemployment=instrument1) if employment>5 & year!=2002, first absorb(fixedeffects_tok fixedeffects_tskx)  cluster(group_cluster2)
capture gen sample=e(sample)


quietly reghdfe lemployment instrument1 if sample==1, absorb(fixedeffects_tok fixedeffects_tskx)  cluster(group_cluster2)
eststo m1
quietly ivreghdfe laverage_wage (lemployment=instrument1) if sample==1, absorb(fixedeffects_tok fixedeffects_tskx)  cluster(group_cluster2)
eststo m2

local elasticity=-1/round(_b[lemployment], 0.001)
dis `elasticity'

*To extract standard error
nlcom -1/_b[lemployment]



*6.1842107
quietly reghdfe laverage_wage lemployment if sample==1, absorb(fixedeffects_tok fixedeffects_tskx)  cluster(group_cluster2)
eststo m3
local elasticity_ols=-1/round(_b[lemployment], 0.001)
dis `elasticity_ols'


*[IN FINAL DRAFT]  - Table D13: Estimates of equation (42)
esttab m1 m2 m3, se  nocons star(* 0.10 ** 0.05  *** 0.01)

***************Estimate correlation and dispersion separately****************************

capture gen ratio=(dev_wage_population/average_wage)^2

*[IN FINAL DRAFT]  - Table D14: Estimates for kappa using dispersion of wages
gmm (ratio - exp(lngamma(1-2/({theta}^2/`elasticity')))/(exp(lngamma(1-1/({theta}^2/`elasticity'))))+1) if sample==1, from(theta 2.08) winitial(identity)

local theta=_b[/theta]
dis 1-`theta'/`elasticity'

gmm (ratio - exp(lngamma(1-2/{theta}))/(exp(lngamma(1-1/{theta})))+1) if sample==1, from(theta 2.08) winitial(identity)

local theta=_b[/theta]
dis `theta'



**************Estimating iota*********************************************

gen avgwage_ratio=log((own_wage_bill/own_employment)/(o_wage_bill/o_employment))
*If fewer than 5 onservations in cell, replace denominator by mean in source-industry across all nationalities to reduce noise in wage ratios
replace avgwage_ratio=log((own_wage_bill/own_employment)/((total_wage_bill-own_wage_bill)/(total_employment-own_employment))) if o_employment<5


gen emp_ratio=log(own_employment/o_employment)


sum fixedeffects_tok 
local max_nationality_industry=r(max)

sum fixedeffects_tsk
local max_source_industry=r(max)

gen tsk_fe=.
gen tok_fe=.

*Create dependent variable of equation 47
gen dependent_variable=avgwage_ratio+(1/`elasticity')*emp_ratio if source_country!=nationality



*Run regression and store fixed effects
quietly reg dependent_variable i.fixedeffects_tsk i.fixedeffects_tok  

forval x=2/`max_source_industry' {

capture local source_fe=_b[`x'.fixedeffects_tsk]
capture replace tsk_fe=`source_fe' if fixedeffects_tsk==`x'

}

forval x=2/`max_nationality_industry' {

capture local nationality_fe=_b[`x'.fixedeffects_tok]
capture replace tok_fe=`nationality_fe'  if fixedeffects_tok==`x'

}



replace tok_fe=. if tok_fe==0
replace tsk_fe=. if tok_fe==.

replace tsk_fe=. if tsk_fe==0
replace tok_fe=. if tsk_fe==.




gen term_subtract1=tok_fe if source_country==nationality
bysort industry year source_country: egen term_subtract=mean(term_subtract1)

*Calculate ratio of effective units
gen lwage_ratio=tsk_fe+term_subtract if term_subtract!=. & tsk_fe!=.
keep if lwage_ratio!=.


duplicates drop source_country industry year, force
keep source_country industry year lwage_ratio


tempfile effective_units
save `effective_units', replace


*Re-open visa level dataset
use "${replication}\visa_level_data.dta", clear

*Pool years in groups of 2
gen year=.
replace year=2002 if original_year==2002 | original_year==2001
replace year=2004 if original_year==2003 | original_year==2004
replace year=2006 if original_year==2005 | original_year==2006
replace year=2008 if original_year==2007 | original_year==2008
replace year=2010 if original_year==2009 | original_year==2010
replace year=2012 if original_year==2011 | original_year==2012
replace year=2014 if original_year==2013 | original_year==2014



*Use detailed versions of source-country and nationality 
gen industry=industry_orig
drop source_country nationality
rename source_country_det source_country
rename nationality_det nationality

gen sourcing=(nationality==source_country)


*Calculate aggregates for regression at the firm level
bysort guo_name industry year: egen total_own_wagebill=sum(wage_num*sourcing)
bysort guo_name industry year: egen total_oth_wagebill=sum(wage_num*(1-sourcing))

bysort guo_name industry year: egen total_own_employment=sum(sourcing)
bysort guo_name industry year: egen total_oth_employment=sum(1-sourcing)

duplicates drop guo_name industry year, force

keep guo_name source_country year industry total_oth_employment total_own_employment total_oth_wagebill total_own_wagebill


gen lwagebillratio=(total_own_wagebill/total_oth_wagebill)

*Keep only MNEs
keep if source_country!="US" & source_country!="other"

gen sample=(total_own_employment>10 & total_oth_employment>10)


merge m:1 year source_country industry using `effective_units'
drop if _m==2
drop _m

rename lwage_ratio leffective_wages

encode industry, gen(industry_numeric)
encode source_country, gen(source_country_numeric)
gen india=(source_country=="IN")

reg lwagebillratio leffective_wages india


*merge m:1 year source_country using "${path}Estimating supply elasticity\gdppercapita.dta"
merge m:1 year source_country using "${replication}\aux_vars_estimation_gdppw.dta", keepusing(gdpperworker)


drop if _m==2
 drop _m



gen lgdpperworker=log(gdpperworker)


egen ts_group=group(source_country year)
egen tk_group=group(industry year)

 
*merge m:1 source_country industry year using "${path}Estimating supply elasticity\shift_share_instruments.dta"
merge m:1 source_country industry year using "${replication}\aux_vars_estimation_inst.dta", keepusing(shift_share_own4)
keep if _m==3
 
 
*Find common sample
quietly ivreghdfe lwagebillratio (leffective_wages=shift_share_own4) if source_country!="IN" & year!=2002, first absorb(tk_group) cluster (ts_group)
gen sample1=e(sample)

quietly ivreghdfe lwagebillratio (leffective_wages=lgdpperworker) if source_country!="IN", first absorb(tk_group) cluster (ts_group)
gen sample2=e(sample)



quietly reghdfe lwagebillratio leffective_wages if sample1==1 & sample2==1, absorb(tk_group) cluster (ts_group) 
eststo m1
quietly reghdfe leffective_wages shift_share_own4 if sample1==1 & sample2==1, absorb(tk_group) cluster (ts_group)
eststo m2
quietly ivreghdfe lwagebillratio (leffective_wages=shift_share_own4) if sample1==1 & sample2==1, first absorb(tk_group) cluster (ts_group)
eststo m3
quietly reghdfe leffective_wages lgdpperworker if sample1==1 & sample2==1, absorb(tk_group) cluster (ts_group)
eststo m4
quietly ivreghdfe lwagebillratio (leffective_wages=lgdpperworker) if sample1==1 & sample2==1, first absorb(tk_group) cluster (ts_group)
eststo m5
quietly reghdfe leffective_wages lgdpperworker shift_share_own4 if sample1==1 & sample2==1, absorb(tk_group) cluster (ts_group)
eststo m6
quietly ivreghdfe lwagebillratio (leffective_wages=lgdpperworker shift_share_own4) if sample1==1 & sample2==1, first absorb(tk_group) cluster (ts_group)
eststo m7


*[IN FINAL DRAFT]  Table D15: Estimating equation for iota - Second stages - For SE need to run the bootstrap routine below
esttab m1 m3 m5 m7, se  nocons star(* 0.10 ** 0.05  *** 0.01)


*[IN FINAL DRAFT]  Table D15: Estimating equation for iota - First stages - For SE need to run the bootstrap routine below
esttab m2 m4 m6, se  nocons star(* 0.10 ** 0.05  *** 0.01)




****************Bootstrap routine for standard errors in Table D15**********************

/* Comment out to run, takes about XX hours*/


/*
set seed 123
local nreps=200

forval z=1/`nreps' {
	
use "${replication}\visa_level_data.dta", clear
	
	bsample
	
tempfile individual`z'
save `individual`z'', replace



*Estimation of labor supply elasticity Kappa


*Pool years in groups of 2
gen year=.
replace year=2002 if original_year==2002 | original_year==2001
replace year=2004 if original_year==2003 | original_year==2004
replace year=2006 if original_year==2005 | original_year==2006
replace year=2008 if original_year==2007 | original_year==2008
replace year=2010 if original_year==2009 | original_year==2010
replace year=2012 if original_year==2011 | original_year==2012
replace year=2014 if original_year==2013 | original_year==2014



*Use detailed versions of source-country and nationality 
gen industry=industry_orig
drop source_country nationality
rename source_country_det source_country
rename nationality_det nationality


*calculate aggregates
gen sourcing=(nationality==source_country)

bysort year source_country industry nationality: egen average_wage=mean(wage_num)
bysort year source_country industry nationality: egen employment=sum(1)
bysort year source_country industry nationality: egen dev_wage_population=sd(wage_num)


bysort source_country year industry: egen total_wage_bill=sum(wage_num)
bysort source_country year industry: egen total_employment=sum(1)

bysort source_country year industry: egen own_wage_bill=sum(wage_num*sourcing)
bysort source_country year industry: egen own_employment=sum(sourcing)

bysort source_country year industry nationality: egen o_wage_bill=sum(wage_num)
bysort source_country year industry nationality: egen o_employment=sum(1)

gen laverage_wage=log(average_wage)
gen lemployment=log(employment)



duplicates drop year industry nationality source_country, force




egen fixedeffects_tok=group(year industry nationality)
egen fixedeffects_tsk=group(year industry source_country)
egen fixedeffects_tskx=group(year industry source_country sourcing)
egen group_cluster2=group(year industry source_country)



*merge m:1 year source_country industry using "${path}Estimating supply elasticity\WIOT files\wiot_to_use.dta", keepusing(exports_non_US exp_byind_non_US)
merge m:1 year source_country industry using "${replication}\aux_vars_estimation_wiot.dta", keepusing(exports_non_US exp_byind_non_US)


drop if _m==2
drop if _m==1 & (source_country=="SG" | source_country=="HK" | source_country=="IL")
drop _m


gen exports_non_US1=exports_non_US/exp_byind_non_US 


bysort year nationality: egen tot_nationality=sum(employment)
gen share01=employment/tot_nationality if year==2002
bysort nationality industry source_country: egen share_01=mean(share01)


gen instrument1=share_01*exports_non_US1


*******Estimating kappa***********


quietly ivreghdfe laverage_wage (lemployment=instrument1) if employment>5 & year!=2002, first absorb(fixedeffects_tok fixedeffects_tskx)  cluster(group_cluster2)

local elasticity=-1/_b[lemployment]
dis `elasticity'



**************Estimating iota*********************************************

gen avgwage_ratio=log((own_wage_bill/own_employment)/(o_wage_bill/o_employment))
*If fewer than 5 onservations in cell, replace denominator by mean in source-industry across all nationalities to reduce noise in wage ratios
replace avgwage_ratio=log((own_wage_bill/own_employment)/((total_wage_bill-own_wage_bill)/(total_employment-own_employment))) if o_employment<5


gen emp_ratio=log(own_employment/o_employment)


sum fixedeffects_tok 
local max_nationality_industry=r(max)

sum fixedeffects_tsk
local max_source_industry=r(max)

gen tsk_fe=.
gen tok_fe=.

*Create dependent variable of equation 47
gen dependent_variable=avgwage_ratio+(1/`elasticity')*emp_ratio if source_country!=nationality



*Run regression and store fixed effects
quietly reg dependent_variable i.fixedeffects_tsk i.fixedeffects_tok  

forval x=2/`max_source_industry' {

capture local source_fe=_b[`x'.fixedeffects_tsk]
capture replace tsk_fe=`source_fe' if fixedeffects_tsk==`x'

}

forval x=2/`max_nationality_industry' {

capture local nationality_fe=_b[`x'.fixedeffects_tok]
capture replace tok_fe=`nationality_fe'  if fixedeffects_tok==`x'

}



replace tok_fe=. if tok_fe==0
replace tsk_fe=. if tok_fe==.

replace tsk_fe=. if tsk_fe==0
replace tok_fe=. if tsk_fe==.




gen term_subtract1=tok_fe if source_country==nationality
bysort industry year source_country: egen term_subtract=mean(term_subtract1)

*Calculate ratio of effective units
gen lwage_ratio=tsk_fe+term_subtract if term_subtract!=. & tsk_fe!=.
keep if lwage_ratio!=.


duplicates drop source_country industry year, force
keep source_country industry year lwage_ratio


tempfile effective_units
save `effective_units', replace


*Re-open visa level dataset
use `individual`z'', clear

*Pool years in groups of 2
gen year=.
replace year=2002 if original_year==2002 | original_year==2001
replace year=2004 if original_year==2003 | original_year==2004
replace year=2006 if original_year==2005 | original_year==2006
replace year=2008 if original_year==2007 | original_year==2008
replace year=2010 if original_year==2009 | original_year==2010
replace year=2012 if original_year==2011 | original_year==2012
replace year=2014 if original_year==2013 | original_year==2014



*Use detailed versions of source-country and nationality 
gen industry=industry_orig
drop source_country nationality
rename source_country_det source_country
rename nationality_det nationality

gen sourcing=(nationality==source_country)


*Calculate aggregates for regression at the firm level
bysort guo_name industry year: egen total_own_wagebill=sum(wage_num*sourcing)
bysort guo_name industry year: egen total_oth_wagebill=sum(wage_num*(1-sourcing))

bysort guo_name industry year: egen total_own_employment=sum(sourcing)
bysort guo_name industry year: egen total_oth_employment=sum(1-sourcing)

duplicates drop guo_name industry year, force

keep guo_name source_country year industry total_oth_employment total_own_employment total_oth_wagebill total_own_wagebill


gen lwagebillratio=(total_own_wagebill/total_oth_wagebill)

*Keep only MNEs
keep if source_country!="US" & source_country!="other"

gen sample=(total_own_employment>10 & total_oth_employment>10)


merge m:1 year source_country industry using `effective_units'
drop if _m==2
drop _m

rename lwage_ratio leffective_wages

encode industry, gen(industry_numeric)
encode source_country, gen(source_country_numeric)
gen india=(source_country=="IN")

reg lwagebillratio leffective_wages india


*merge m:1 year source_country using "${path}Estimating supply elasticity\gdppercapita.dta"
merge m:1 year source_country using "${replication}\aux_vars_estimation_gdppw.dta", keepusing(gdpperworker)


drop if _m==2
 drop _m



gen lgdpperworker=log(gdpperworker)


egen ts_group=group(source_country year)
egen tk_group=group(industry year)

 
*merge m:1 source_country industry year using "${path}Estimating supply elasticity\shift_share_instruments.dta"
merge m:1 source_country industry year using "${replication}\aux_vars_estimation_inst.dta", keepusing(shift_share_own4)
keep if _m==3
 
 
*Find common sample
quietly ivreghdfe lwagebillratio (leffective_wages=shift_share_own4) if source_country!="IN" & year!=2002, first absorb(tk_group) cluster (ts_group)
gen sample1=e(sample)

quietly ivreghdfe lwagebillratio (leffective_wages=lgdpperworker) if source_country!="IN", first absorb(tk_group) cluster (ts_group)
gen sample2=e(sample)


* OLS
quietly reghdfe lwagebillratio leffective_wages if sample1==1 & sample2==1, absorb(tk_group) cluster (ts_group) 
local beta2=_b[leffective_wages]
gen beta2=`beta2'

* Supply push instrument
quietly reghdfe leffective_wages shift_share_own4 if sample1==1 & sample2==1, absorb(tk_group) cluster (ts_group)
local beta1_1=_b[shift_share_own4]
gen beta1_1=`beta1_1'
quietly ivreghdfe lwagebillratio (leffective_wages=shift_share_own4) if sample1==1 & sample2==1, first absorb(tk_group) cluster (ts_group)
local beta2_1=_b[leffective_wages]
gen beta2_1=`beta2_1'

*GDP per capita instrument
quietly reghdfe leffective_wages lgdpperworker if sample1==1 & sample2==1, absorb(tk_group) cluster (ts_group)
local beta1_2=_b[lgdpperworker]
gen beta1_2=`beta1_2'
quietly ivreghdfe lwagebillratio (leffective_wages=lgdpperworker) if sample1==1 & sample2==1, first absorb(tk_group) cluster (ts_group)
local beta2_2=_b[leffective_wages]
gen beta2_2=`beta2_2'

*Both instruments at the same time
quietly reghdfe leffective_wages lgdpperworker shift_share_own4 if sample1==1 & sample2==1, absorb(tk_group) cluster (ts_group)
local beta1_3=_b[lgdpperworker]
gen beta1_3=`beta1_3'
local beta1_4=_b[shift_share_own4]
gen beta1_4=`beta1_4'
quietly ivreghdfe lwagebillratio (leffective_wages=lgdpperworker shift_share_own4) if sample1==1 & sample2==1, first absorb(tk_group) cluster (ts_group)
local beta2_3=_b[leffective_wages]
gen beta2_3=`beta2_3'
local fstat_3=e(widstat)
gen fstat_3=`fstat_3'



keep beta2 beta1_1 beta2_1 beta1_2 beta2_2 beta1_3 beta1_4 beta2_3 fstat_3
duplicates drop beta2 beta1_1 beta2_1 beta1_2 beta2_2  beta1_3 beta1_4 beta2_3 fstat_3, force

tempfile boot`z'
save `boot`z'', replace
	
}


local nreps2=`nreps'
local nreps1=`nreps2'-1

forval z=1/`nreps1' {

append using `boot`z''

}


/*Order of coefficients 
beta2: OLS
beta1_1: 1st stage supply push
beta2_1: 2nd stage supply push
beta1_2: 1st stage GDP per worker 
beta2_2: 2nd stage GDP per worker  
beta1_3: 1st stage both instruments - coeff on GDP per worker 
beta1_4: 1st stage both instruments - supply push  
beta2_3: 2nd stage both instruments
fstat_3: KP Fstat of joint significance of instruments
*/

*Take the standard deviations reported in Table below except for fstat_3, where I take the mean
sum beta2 beta1_1 beta2_1 beta1_2 beta2_2 beta1_3 beta1_4 beta2_3 fstat_3


save "${replication}\bootstrap_sample.dta", replace


*/

